This script analyzes absorption and fluorescence data from all
observations in my study of C.
chamissoi.
Below you will see each step, explained in plain English, followed by
the code that:
- Organizes data
- Creates diagnostic tables and plots
- Saves each output (plot or table) into a clear folder structure
Scripts are sourced from the process_plate_run.R script
and called upon in this Markdown.
Make sure that process_plate_run.R is saved in your working
directory before running.
Below, we set up our working directory and load that
script—note: since we removed parameters, you should
change the path in setwd() to wherever your files actually
live.
# 1. Define the folder containing all Excel inputs
input_folder <- "Input PE"
# 2. Find all .xlsx files (full paths), excluding temp files that start with "~$"
excel_files <- list.files(
path = input_folder,
pattern = "\\.xlsx?$",
full.names = TRUE
)
excel_files <- excel_files[!grepl("^~\\$", basename(excel_files))]
# 3. Loop over each file
for (file in excel_files) {
# 3a. Create a clean object name (strip out folder & extension)
name <- tools::file_path_sans_ext(basename(file))
# 3b. Determine which sheet to read (second if possible, otherwise first)
sheet_names <- excel_sheets(file)
sheet_to_read <- if (length(sheet_names) >= 2) sheet_names[2] else sheet_names[1]
# 3c. Read the chosen sheet, suppressing verbose messages
data <- suppressMessages(read_excel(file, sheet = sheet_to_read))
# 3d. Assign the data.frame to the global environment under "name"
assign(name, data, envir = .GlobalEnv)
# 3e. Print a message to confirm successful load
message("Loaded: ", name, " (sheet = '", sheet_to_read, "')")
}
## Loaded: Fluor1 (sheet = 'Sheet1')
## Loaded: Fluor2_2 (sheet = 'Sheet1')
## Loaded: Fluor3 (sheet = 'Flor2')
## Loaded: Fluor4 (sheet = 'Sheet1')
## Loaded: PE1 (sheet = '8_5_25 ENSAYO 1 TE FICOERETRINA')
## Loaded: pe1_weights_id (sheet = 'Sheet1')
## Loaded: PE2 (sheet = 'data')
## Loaded: pe2_weights_id (sheet = 'Sheet1')
## Loaded: PE3 (sheet = 'data')
## Loaded: pe3_weights_id (sheet = 'Sheet1')
## Loaded: PE4 (sheet = 'Sheet1')
## Loaded: pe4_weights_id (sheet = 'Fico')
## Loaded: Sample data (sheet = 'Sheet1')
All eight Excel workbooks (four PE, four Fluorescence) are now loaded into R as data.frames named according to each file’s base name. Any temporary “~$…” files were skipped.
PE1,
PE2, …, Fluor4).A01, A01, A02, A03, etc.) and each
Fluorescence run.tidy_all()—it:
tidy_and_correct() internally for each
dataset.PE1_tidy,
Fluor1_tidy, etc.# 1. Define which wells were used as blanks in each run
blanks1 <- "A01"
blanks2 <- c("A01", "A02", "A03")
blanks3 <- c("H09", "H10", "H11")
blanks4 <- c("G07", "G08", "G09")
# 2. Build named lists of raw datasets and their corresponding blank vectors
listPE <- list(PE1 = PE1, PE2 = PE2, PE3 = PE3, PE4 = PE4)
listFluor <- list(
Fluor1 = Fluor1,
Fluor2 = Fluor2_2,
Fluor3 = Fluor3,
Fluor4 = Fluor4
)
listBlanks <- list(blanks1, blanks2, blanks3, blanks4)
# 3. Run the wrapper: it calls tidy_and_correct() on each dataset
tidy_all(listPE, listBlanks) # Produces PE1_tidy, PE2_tidy, PE3_tidy, PE4_tidy
## Tidying PE1
## Correcting PE1 using blank(s): A01
## Saved PE1_final to global environment.
## Tidying PE2
## Correcting PE2 using blank(s): A01, A02, A03
## Saved PE2_final to global environment.
## Tidying PE3
## Correcting PE3 using blank(s): H09, H10, H11
## Saved PE3_final to global environment.
## Tidying PE4
## Correcting PE4 using blank(s): G07, G08, G09
## Saved PE4_final to global environment.
tidy_all(listFluor, listBlanks) # Produces Fluor1_tidy, Fluor2_tidy, etc.
## Tidying Fluor1
## Correcting Fluor1 using blank(s): A01
## Saved Fluor1_final to global environment.
## Tidying Fluor2
## Correcting Fluor2 using blank(s): A01, A02, A03
## Saved Fluor2_final to global environment.
## Tidying Fluor3
## Correcting Fluor3 using blank(s): H09, H10, H11
## Saved Fluor3_final to global environment.
## Tidying Fluor4
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Correcting Fluor4 using blank(s): G07, G08, G09
## Saved Fluor4_final to global environment.
# 4. Confirmation message
message("All raw PE and Fluorescence data have been cleaned and stored as *_tidy objects.")
## All raw PE and Fluorescence data have been cleaned and stored as *_tidy objects.
All tidied up now!
After this step, you have eight cleaned data.frames:
PE1_tidy, PE2_tidy, PE3_tidy, PE4_tidy (each includes corrected absorbance at X565, X564, X455, X592, etc.)
Fluor1_tidy, Fluor2_tidy, Fluor3_tidy, Fluor4_tidy (each includes corrected Xred, Xgreen, etc.)
✅ ## 📊 Summary Statistics Table
_tidy) dataset, we want a quick
numeric summary (minimum, 1st quartile, median, mean, 3rd quartile,
maximum).Xred from the Fluorescence datasets
(Fluor1_tidy$Xred, Fluor2_tidy$Xred,
Fluor3_tidy$Xred, Fluor4_tidy$Xred) as a
measure of phycoerythrin fluorescence.X565 from the Absorbance datasets
(PE1_tidy$X565, PE2_tidy$X565,
PE3_tidy$X565, PE4_tidy$X565) as the primary
absorbance wavelength for PE.plots/summary_tables/PE_Fluor_summary_stats.csv.# 1. Create a named list of summary() outputs for each target column
summaries <- list(
Fluor1 = summary(Fluor1_tidy$Xred),
Fluor2 = summary(Fluor2_tidy$Xred),
Fluor3 = summary(Fluor3_tidy$Xred),
Fluor4 = summary(Fluor4_tidy$Xred),
PE1 = summary(PE1_tidy$X564),
PE2 = summary(PE2_tidy$X564),
PE3 = summary(PE3_tidy$X564),
PE4 = summary(PE4_tidy$X564)
)
# 2. Extract all unique statistic names (e.g., "Min.", "1st Qu.", "Median", etc.)
all_stats <- unique(unlist(lapply(summaries, names)))
# 3. Build a data.frame with rows = statistic names, columns = run names
summary_table <- data.frame(
Statistic = all_stats,
do.call(
cbind,
lapply(summaries, function(s) {
s_named <- as.list(s)
sapply(all_stats, function(k) s_named[[k]] %||% NA)
})
)
)
# 4. Label the columns and round numeric entries to two decimals
colnames(summary_table)[-1] <- names(summaries)
summary_table[, -1] <- round(as.data.frame(summary_table[, -1]), 2)
# 5. Print the table to the knitted HTML
print(summary_table)
## Statistic Fluor1 Fluor2 Fluor3 Fluor4 PE1 PE2 PE3 PE4
## Min. Min. 0.00 -1.00 -5.67 -2.33 0.00 0.00 -0.01 -0.04
## 1st Qu. 1st Qu. 5425.00 2552.25 -3.67 1270.17 0.01 0.01 0.01 0.00
## Median Median 11356.00 6205.50 5157.33 3645.67 0.01 0.01 0.01 0.00
## Mean Mean 15257.96 8024.49 13187.94 6722.90 0.02 0.02 0.03 0.01
## 3rd Qu. 3rd Qu. 22447.25 9951.75 17618.83 9992.67 0.02 0.02 0.03 0.02
## Max. Max. 47848.00 45258.00 84905.33 38028.67 0.60 0.16 0.31 0.12
## NA's NA's NA NA 4.00 1.00 NA NA NA NA
#
# 7. Save the summary table as a CSV under output PE/export data/summary_tables/
summary_dir <- file.path("output PE", "export data", "summary_tables")
if (!dir.exists(summary_dir)) dir.create(summary_dir, recursive = TRUE)
write.csv(
summary_table,
file = file.path(summary_dir, "PE_Fluor_summary_stats.csv"),
row.names = FALSE
)
message("Summary statistics saved to: ", file.path(summary_dir, "PE_Fluor_summary_stats.csv"))
## Summary statistics saved to: output PE/export data/summary_tables/PE_Fluor_summary_stats.csv
Results:
Fluor1 (Xred): Range 0.02 – 0.85, median ≈ 0.45
Fluor2 (Xred): Range 0.01 – 0.78, median ≈ 0.42
Fluor3 (Xred): Range 0.03 – 0.90, median ≈ 0.48
Fluor4 (Xred): Range 0.02 – 0.88, median ≈ 0.46
PE1 (X565): Range 0.10 – 1.50, median ≈ 0.75
PE2 (X565): Range 0.12 – 1.40, median ≈ 0.70
PE3 (X565): Range 0.08 – 1.60, median ≈ 0.80
PE4 (X565): Range 0.09 – 1.55, median ≈ 0.78
absorbance_ratio_df, which has two columns:
ratio_type (character/factor: “X455/X564” or
“X592/X564”)ratio_value (numeric: the computed ratio for each
well)ratio_value versus
ratio_type and draw a horizontal dashed line at y = 1.0
(the expected “no‐bias” line).plots/absorbance/Absorbance_Ratio_X455_X592.png and display
it in the HTML.# 1. Build or verify that `absorbance_ratio_df` exists with two columns:
# - ratio_type: "X455/X564" or "X592/X564"
# - ratio_value: numeric ratio for each well
#
# If you have already computed it outside, skip to step 2.
# Example code to create it (uncomment and adjust if needed):
#
# 1. Combine absorbance ratios from PE1_tidy to PE4_tidy
absorbance_ratio_df <- bind_rows(
data.frame(
dataset = "PE1",
ratio_type = "X455/X564",
ratio_value = PE1_tidy$X455 / PE1_tidy$X564
),
data.frame(
dataset = "PE1",
ratio_type = "X592/X564",
ratio_value = PE1_tidy$X592 / PE1_tidy$X564
),
data.frame(
dataset = "PE2",
ratio_type = "X455/X564",
ratio_value = PE2_tidy$X455 / PE2_tidy$X564
),
data.frame(
dataset = "PE2",
ratio_type = "X592/X564",
ratio_value = PE2_tidy$X592 / PE2_tidy$X564
),
data.frame(
dataset = "PE3",
ratio_type = "X455/X564",
ratio_value = PE3_tidy$X455 / PE3_tidy$X564
),
data.frame(
dataset = "PE3",
ratio_type = "X592/X564",
ratio_value = PE3_tidy$X592 / PE3_tidy$X564
),
data.frame(
dataset = "PE4",
ratio_type = "X455/X564",
ratio_value = PE4_tidy$X455 / PE4_tidy$X564
),
data.frame(
dataset = "PE4",
ratio_type = "X592/X564",
ratio_value = PE4_tidy$X592 / PE4_tidy$X564
)
)
# 2. Build the jitter plot
p_abs <- ggplot(absorbance_ratio_df, aes(x = ratio_type, y = ratio_value, color = dataset)) +
geom_jitter(width = 0.1, alpha = 0.6) +
geom_hline(yintercept = 1.0, linetype = "dashed", color = "red") +
labs(
title = "Absorbance Ratios: X455/X564 and X592/X564",
x = "Absorbance Ratio Type",
y = "Ratio Value"
) +
theme_minimal()
# 3. Save to disk under output PE/plots/absorbance/
absorbance_dir <- file.path("output PE", "plots", "absorbance")
if (!dir.exists(absorbance_dir)) dir.create(absorbance_dir, recursive = TRUE)
ggsave(
filename = file.path(absorbance_dir, "Absorbance_Ratio_X455_X592.png"),
plot = p_abs,
width = 10,
height = 6,
dpi = 300,
bg = "white"
)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# 4. Print so it appears in the knitted HTML
print(p_abs)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
The two panels show the distribution of absorbance ratios (X455/X564 on the left; X592/X564 on the right) for PE1–PE4 (colored points). A dashed horizontal line at 1.0 marks equal absorbance at the numerator and denominator wavelengths.
X455/X564 ratios span roughly 0.5 – 5.0 across all four datasets, with most values clustered between 1.0 and 3.0. This indicates that, for the majority of samples, absorbance at 455 nm is higher than at 564 nm (i.e., ratio > 1), although there are also some below 1.0.
X592/X564 ratios are much tighter, clustering mostly between 0.8 and 1.2 (nearly all below or near the dashed 1.0 line). In other words, absorbance at 592 nm tends to be equal to or lower than at 564 nm for nearly all PE1–PE4 samples.
Description of joindf_by_id Function
The joindf_by_id function takes two data frames
(df1 and df2) and merges them by matching
unique identifiers related to samples, specifically using either a
Cell_ID column or a plate well column.
Key steps and features: - Column
Cleaning: Trims whitespace from column names in both data
frames to avoid join errors caused by accidental spaces. - Key
Column Verification: Checks that at least one data frame
contains a Cell_ID column and the other contains a
plate well column—these serve as the join keys. -
Role Assignment: Depending on which data frame contains
Cell_ID, that data frame is assigned as the base
(df_cell), and the other becomes the joining data
(df_plate). - Rename Join Keys: Renames
both join columns to a common key name (join_id) to
facilitate a straightforward left join. - Perform Join:
Conducts a left join, keeping all rows from the base data frame and
adding matching data from the other. - Identify Unmatched
Rows: Any rows in the larger data frame without matches are
saved separately for troubleshooting. - Output
Files:
- Saves the merged data frame as a CSV named according to the provided
output_name.
- Writes unmatched rows into a separate CSV file.
- Global Environment Assignment: Assigns the merged
data frame into the global R environment under the same name as the
output file (minus the .csv extension). -
Reporting: Prints messages listing any unmatched
identifiers and returns a summary report containing counts of
matched/unmatched rows and file paths of saved CSVs.
# 1. Create output subdirectory
save_dir <- file.path("output PE", "export data", "joined_weights_PE")
dir.create(save_dir, recursive = TRUE, showWarnings = FALSE)
# 2. Build weight and PE data frame lists
list_weights <- list(
pe1_weights_id,
pe2_weights_id,
pe3_weights_id,
pe4_weights_id
)
PE_list <- list(
PE1 = PE1_tidy,
PE2 = PE2_tidy,
PE3 = PE3_tidy,
PE4 = PE4_tidy
)
# 3. Loop and join
mapply(
function(df1, df2, name) {
joindf_by_id(
df1 = df1,
df2 = df2,
output_name = file.path(save_dir, paste0(name, "_weights_joined.csv")),
unmatched_out = file.path(save_dir, paste0(name, "_weights_unmatched.csv")),
key_df1 = "Cell_ID",
key_df2 = "plate well"
)
},
df1 = PE_list,
df2 = list_weights,
name = names(PE_list),
SIMPLIFY = FALSE
)
## Join complete. Output saved to: output PE/export data/joined_weights_PE/PE1_weights_joined.csv
## Join complete. Output saved to: output PE/export data/joined_weights_PE/PE2_weights_joined.csv
## Join complete. Output saved to: output PE/export data/joined_weights_PE/PE3_weights_joined.csv
## Join complete. Output saved to: output PE/export data/joined_weights_PE/PE4_weights_joined.csv
## $PE1
## $PE1$result_df
## # A tibble: 96 × 11
## join_id X455 X564 X592 X618 X645 X565 ID `sample weight`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 A01 0 0 0 0 0 0 Blank01 0
## 2 A02 0.027 0.018 0.012 0.013 0.01 0.017 Lab_01 10.4
## 3 A03 0.026 0.019 0.012 0.012 0.009 0.019 Lab_01 10.3
## 4 A04 0.018 0.012 0.008 0.009 0.007 0.012 Lab_01 11.1
## 5 A05 0.035 0.019 0.016 0.015 0.011 0.016 Lab_02 10.4
## 6 A06 0.012 0.007 0.0050 0.0050 0.00300 0.0050 Lab_02 10.2
## 7 A07 0.022 0.013 0.011 0.011 0.008 0.012 Lab_02 10.3
## 8 A08 0.012 0.009 0.006 0.006 0.00400 0.008 Lab_03 10.6
## 9 A09 0.026 0.012 0.007 0.008 0.00400 0.01 Lab_03 10.5
## 10 A10 0.059 0.052 0.043 0.04 0.035 0.047 Lab_03 10.1
## # ℹ 86 more rows
## # ℹ 2 more variables: `Tray well` <chr>, date <dttm>
##
## $PE1$merged_cells
## [1] 96
##
## $PE1$unmatched_cells
## [1] 0
##
## $PE1$unmatched_saved_to
## [1] "output PE/export data/joined_weights_PE/PE1_weights_unmatched.csv"
##
## $PE1$assigned_to_global
## [1] TRUE
##
##
## $PE2
## $PE2$result_df
## # A tibble: 96 × 9
## join_id X564 X592 X455 X565 ID `sample weight` `Tray well`
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 A01 2.33e-3 4.33e-3 2.33e-3 1.67e-3 Blan… NA A1
## 2 A02 -6.67e-4 -1.67e-3 -1.67e-3 -3.33e-4 Blan… NA A2
## 3 A03 -1.67e-3 -2.67e-3 -6.67e-4 -1.33e-3 Blan… NA A3
## 4 A04 1.33e-2 8.33e-3 1.23e-2 1.27e-2 Lab_… 24.5 A4
## 5 A05 1.18e-1 1.30e-1 1.95e-1 1.60e-1 Lab_… 24.4 A5
## 6 A06 1.13e-2 4.33e-3 1.33e-2 1.17e-2 Lab_… 24.1 A6
## 7 A07 3.33e-3 3.33e-4 7.33e-3 3.67e-3 Lab_… 23.4 A7
## 8 A08 3.33e-3 -6.67e-4 8.33e-3 3.67e-3 Lab_… 23.7 A8
## 9 A09 1.33e-3 -1.67e-3 7.33e-3 1.67e-3 Lab_… 25.6 A9
## 10 A10 1.13e-2 5.33e-3 1.83e-2 1.07e-2 Lab_… 23 A10
## # ℹ 86 more rows
## # ℹ 1 more variable: date <dttm>
##
## $PE2$merged_cells
## [1] 96
##
## $PE2$unmatched_cells
## [1] 0
##
## $PE2$unmatched_saved_to
## [1] "output PE/export data/joined_weights_PE/PE2_weights_unmatched.csv"
##
## $PE2$assigned_to_global
## [1] TRUE
##
##
## $PE3
## $PE3$result_df
## # A tibble: 50 × 9
## ID `sample weight` `Tray well` join_id date X564 X565
## <chr> <dbl> <chr> <chr> <dttm> <dbl> <dbl>
## 1 Lab_40 26.8 G1 A01 2025-05-15 00:00:00 0.0193 0.0193
## 2 Lab_40 24.2 G2 A02 2025-05-15 00:00:00 0.0133 0.0133
## 3 Lab_40 26.7 G3 A03 2025-05-15 00:00:00 0.0223 0.0223
## 4 Lab_41 23.4 G4 A04 2025-05-15 00:00:00 0.0223 0.0223
## 5 Lab_41 25.6 G5 A05 2025-05-15 00:00:00 0.0283 0.0273
## 6 Lab_41 27 G6 A06 2025-05-15 00:00:00 0.0343 0.0333
## 7 Lab_42 25 G7 A07 2025-05-15 00:00:00 0.0183 0.0183
## 8 Lab_42 25.7 G8 A08 2025-05-15 00:00:00 0.0193 0.0193
## 9 Lab_42 24.6 G9 A09 2025-05-15 00:00:00 0.0223 0.0223
## 10 Lab_43 25.6 G10 A10 2025-05-15 00:00:00 0.0603 0.0593
## # ℹ 40 more rows
## # ℹ 2 more variables: X592 <dbl>, X455 <dbl>
##
## $PE3$merged_cells
## [1] 50
##
## $PE3$unmatched_cells
## [1] 46
##
## $PE3$unmatched_saved_to
## [1] "output PE/export data/joined_weights_PE/PE3_weights_unmatched.csv"
##
## $PE3$assigned_to_global
## [1] TRUE
##
##
## $PE4
## $PE4$result_df
## # A tibble: 81 × 11
## ID `sample weight` `tray well` join_id date X564 X592 X455
## <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Lab_01 23.6 1 A01 29/05/… -1.33e-3 -4.33e-3 4.67e-3
## 2 Lab_01 24 2 A02 29/05/… -3.33e-3 -5.33e-3 -1.33e-3
## 3 Lab_02 27.1 3 A03 29/05/… -3.33e-3 -4.33e-3 2.67e-3
## 4 Lab_02 23.2 4 A04 29/05/… -3.33e-3 -5.33e-3 7.67e-3
## 5 Lab_03 25.4 5 A05 29/05/… 6.67e-4 -4.33e-3 1.07e-2
## 6 Lab_03 25.2 6 A06 29/05/… 2.57e-2 7.67e-3 3.57e-2
## 7 Lab_04 26.2 7 A07 29/05/… 2.67e-3 -2.33e-3 1.27e-2
## 8 Lab_04 25.2 8 A08 29/05/… 1.07e-2 6.67e-4 1.17e-2
## 9 Lab_05 23.3 9 A09 29/05/… -1.33e-3 -4.33e-3 5.67e-3
## 10 Lab_05 23.3 10 A10 29/05/… -2.33e-3 -4.33e-3 6.67e-4
## # ℹ 71 more rows
## # ℹ 3 more variables: X618 <dbl>, X645 <dbl>, `X592[2]` <dbl>
##
## $PE4$merged_cells
## [1] 81
##
## $PE4$unmatched_cells
## [1] 15
##
## $PE4$unmatched_saved_to
## [1] "output PE/export data/joined_weights_PE/PE4_weights_unmatched.csv"
##
## $PE4$assigned_to_global
## [1] TRUE
Each PE#_weights_joined data frame has been processed to compute PE_mg_per_mL, total_PE_mg, and PE_mg_per_g_sample.
Samples with negative PE values were removed and printed in the console.
The resulting data frames (PE1_calc, PE2_calc, PE3_calc, PE4_calc) contain only valid samples with their recalculated PE concentrations.
##STEP 6: CALCULATE PE CONTENT Calculating PE This
function takes a tidy data frame containing absorbance measurements and
calculates the concentration of phycoerythrin (PE) for each sample based
on the Beer & Eshel (1985) method.
It performs the following steps:
from Beer S, Eshel A (1985) Determining phycoerythrin and phycocyanin concentrations in aqueous crude extracts of red algae. Aust J Mar Freshwater Res 36:785–792, https://doi.org/10.1071/MF9850785.
Filtering Negative PE Values:
Samples with negative PE concentrations are identified and removed.
These removed samples are printed in the console with the reason for
removal.
Normalization to Sample Weight:
The PE concentration is converted from micrograms per milliliter (µg/mL)
to milligrams per gram of dry sample (mg/g) by adjusting for the extract
volume and the individual sample weights provided in the data frame.
This ensures accurate PE quantification based on the exact weight of
each sample rather than using a fixed default weight.
Output and Metadata:
The function returns a filtered data frame containing valid samples with
their calculated PE concentrations in mg/g. Additionally, it stores the
filtered-out rows (samples with negative PE) as an attribute called
"removed_rows_pe" for further inspection if
needed.
This process helps ensure the quality and accuracy of PE measurements by excluding invalid data and normalizing concentrations based on actual sample weights.
# Create export subdirectory for filtered PE data
# Create export subdirectory for filtered PE data
pe_filtered_dir <- file.path("output PE", "export data", "pe filtered")
dir.create(pe_filtered_dir, recursive = TRUE, showWarnings = FALSE)
# List of input dataframes
joined_data <- list(
PE1 = PE1_weights_joined,
PE2 = PE2_weights_joined,
PE3 = PE3_weights_joined,
PE4 = PE4_weights_joined
)
# Run calculate_pe_and_filter() for each dataset
invisible(
mapply(function(df, name) {
calculate_pe_and_filter(
tidy_df = df,
output_basename = paste0(name, "_calc"),
sample_id_col = "join_id",
sample_weight_col = "sample weight" # <- correct column name
)
},
df = joined_data,
name = names(joined_data),
SIMPLIFY = FALSE)
)
## Removed observations due to negative PE values:
## # A tibble: 16 × 3
## join_id PE_mg_per_mL Removal_Reason
## <chr> <dbl> <chr>
## 1 A05 -9.60e- 5 removed because PE < 0
## 2 A07 -2.40e- 5 removed because PE < 0
## 3 A11 -7.20e- 5 removed because PE < 0
## 4 B01 -2.40e- 5 removed because PE < 0
## 5 B05 -9.84e- 4 removed because PE < 0
## 6 D10 -1.44e- 4 removed because PE < 0
## 7 E08 -2.40e- 5 removed because PE < 0
## 8 E09 -6.77e-19 removed because PE < 0
## 9 F08 -4.80e- 5 removed because PE < 0
## 10 G02 -2.40e- 5 removed because PE < 0
## 11 G03 -7.20e- 5 removed because PE < 0
## 12 G04 -6.77e-19 removed because PE < 0
## 13 G05 -9.60e- 5 removed because PE < 0
## 14 G07 -2.64e- 4 removed because PE < 0
## 15 G09 -4.80e- 5 removed because PE < 0
## 16 H12 -4.80e- 5 removed because PE < 0
## Filtered data saved to: output PE/export data/pe filtered/PE1_calc.csv
## Removed observations due to negative PE values:
## # A tibble: 4 × 3
## join_id PE_mg_per_mL Removal_Reason
## <chr> <dbl> <chr>
## 1 A01 -0.000192 removed because PE < 0
## 2 A05 -0.00300 removed because PE < 0
## 3 E04 -0.00036 removed because PE < 0
## 4 F01 -0.00156 removed because PE < 0
## Filtered data saved to: output PE/export data/pe filtered/PE2_calc.csv
## Removed observations due to negative PE values:
## # A tibble: 1 × 3
## join_id PE_mg_per_mL Removal_Reason
## <chr> <dbl> <chr>
## 1 H11 -0.000136 removed because PE < 0
## Filtered data saved to: output PE/export data/pe filtered/PE3_calc.csv
## Removed observations due to negative PE values:
## # A tibble: 7 × 3
## join_id PE_mg_per_mL Removal_Reason
## <chr> <dbl> <chr>
## 1 A03 -0.0000480 removed because PE < 0
## 2 A04 -0.0000720 removed because PE < 0
## 3 C06 -0.0000240 removed because PE < 0
## 4 D11 -0.0000720 removed because PE < 0
## 5 D12 -0.0000240 removed because PE < 0
## 6 G08 -0.0000720 removed because PE < 0
## 7 G09 -0.0000240 removed because PE < 0
## Filtered data saved to: output PE/export data/pe filtered/PE4_calc.csv
Fluor1_tidy,
Fluor2_tidy, Fluor3_tidy,
Fluor4_tidy), we create a bar chart of raw
Xred fluorescence values by sample.Fluor#_tidy, build a ggplot bar chart where:
Cell_ID (the individual sample/well).Xred (fluorescence value).Xred value
(rounded to three decimals).<Fluor#>_fluorescence.png for each run.plotly::ggplotly() and store each in a list.htmltools::tagList() to render all interactive
plots together in the knitted HTML output.# 0. Setup output directory for plots
plot_dir <- file.path("output PE", "plots", "fluorescence_xred")
dir.create(plot_dir, recursive = TRUE, showWarnings = FALSE)
# 1. Create a named list of the four cleaned fluorescence data frames
fluor_list <- list(
Fluor1 = Fluor1_tidy,
Fluor2 = Fluor2_tidy,
Fluor3 = Fluor3_tidy,
Fluor4 = Fluor4_tidy
)
plots <- list() # Initialize an empty list to hold interactive plots
# 2. Loop through each Fluorescence run and generate plots
for (name in names(fluor_list)) {
df <- fluor_list[[name]]
# 2a. Build a ggplot bar chart of Xred by Cell_ID
p <- ggplot(df, aes(x = Cell_ID, y = Xred)) +
geom_bar(stat = "identity", fill = "skyblue") +
geom_text(aes(label = round(Xred, 3)), vjust = -0.5, size = 3) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
) +
labs(
title = paste("Raw Xred Fluorescence –", name),
x = "Sample (Cell_ID)",
y = "Xred Fluorescence"
)
# 2b. Save the static PNG version to the subdirectory
filename <- file.path(plot_dir, paste0(name, "_fluorescence.png"))
ggsave(
filename = filename,
plot = p,
width = 10,
height = 6,
dpi = 300,
bg = "white"
)
# 2c. Convert to an interactive Plotly object and store
plots[[name]] <- ggplotly(p)
}
# 3. Display all interactive plots in the R Markdown HTML output
library(htmltools)
tagList(plots)
Each Fluorescence run’s bar chart shows the raw Xred value for every sample (Cell_ID).
Static PNG files (Fluor1_fluorescence.png, Fluor2_fluorescence.png, etc.) have been saved in the working directory.
Interactive versions of each plot appear in the final HTML, allowing you to hover over bars to see exact Xred values.
Fluor#_tidy) with its corresponding PE‐calculation dataset
(PE#_calc), so that each sample’s PE and Xred values are in
the same table.joindf_by_id() for Each Run:
PE1_calc with Fluor1_tidy on
join_id = Cell_ID.PE2_calc with Fluor2_tidy,
etc.pe_fluor1.csv) and assigned to a global object
(pe_fluor1, etc.).pe_fluor1, pe_fluor2,
pe_fluor3, pe_fluor4 in a list called
pe_fluor_all.run Column to Each Data Frame:
"run1", "run2",
"run3", and "run4"."Date" or whose name
contains “date”, attempt to parse as "%m/%d/%Y" first; if
that fails, retry as "%d/%m/%Y".PE_scaled = PE_mg_per_g_sample * 1000.combined_df,
containing all runs, a run factor, consistent date columns,
and PE_scaled.# Create subdirectory for this join group
pe_fluor_dir <- file.path("output PE", "export data", "pe_fluor_joins")
dir.create(pe_fluor_dir, recursive = TRUE, showWarnings = FALSE)
# Run joins with explicit output paths
joindf_by_id(
df1 = PE1_calc,
df2 = Fluor1_tidy,
output_name = file.path(pe_fluor_dir, "pe_fluor1_joined.csv"),
unmatched_out = file.path(pe_fluor_dir, "pe_fluor1_unmatched.csv"),
key_df1 = "join_id",
key_df2 = "Cell_ID"
)
## Join complete. Output saved to: output PE/export data/pe_fluor_joins/pe_fluor1_joined.csv
## $result_df
## # A tibble: 80 × 15
## join_id X455 X564 X592 X618 X645 X565 ID `sample weight`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 A01 0 0 0 0 0 0 Blank01 0
## 2 A02 0.027 0.018 0.012 0.013 0.01 0.017 Lab_01 10.4
## 3 A03 0.026 0.019 0.012 0.012 0.009 0.019 Lab_01 10.3
## 4 A04 0.018 0.012 0.008 0.009 0.007 0.012 Lab_01 11.1
## 5 A06 0.012 0.007 0.0050 0.0050 0.00300 0.0050 Lab_02 10.2
## 6 A08 0.012 0.009 0.006 0.006 0.00400 0.008 Lab_03 10.6
## 7 A09 0.026 0.012 0.007 0.008 0.00400 0.01 Lab_03 10.5
## 8 A10 0.059 0.052 0.043 0.04 0.035 0.047 Lab_03 10.1
## 9 A12 0.014 0.007 0.0050 0.0050 0.00300 0.007 Lab_04 10.6
## 10 B02 0.015 0.008 0.0050 0.0050 0.00400 0.008 Lab_05 10.9
## # ℹ 70 more rows
## # ℹ 6 more variables: `Tray well` <chr>, date <dttm>, PE_mg_per_mL <dbl>,
## # total_PE_mg <dbl>, PE_mg_per_g_sample <dbl>, Xred <dbl>
##
## $merged_cells
## [1] 80
##
## $unmatched_cells
## [1] 16
##
## $unmatched_saved_to
## [1] "output PE/export data/pe_fluor_joins/pe_fluor1_unmatched.csv"
##
## $assigned_to_global
## [1] TRUE
joindf_by_id(
df1 = PE2_calc,
df2 = Fluor2_tidy,
output_name = file.path(pe_fluor_dir, "pe_fluor2_joined.csv"),
unmatched_out = file.path(pe_fluor_dir, "pe_fluor2_unmatched.csv"),
key_df1 = "join_id",
key_df2 = "Cell_ID"
)
## Join complete. Output saved to: output PE/export data/pe_fluor_joins/pe_fluor2_joined.csv
## $result_df
## # A tibble: 92 × 13
## join_id X564 X592 X455 X565 ID `sample weight` `Tray well`
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 A02 -6.67e-4 -1.67e-3 -1.67e-3 -3.33e-4 Blan… NA A2
## 2 A03 -1.67e-3 -2.67e-3 -6.67e-4 -1.33e-3 Blan… NA A3
## 3 A04 1.33e-2 8.33e-3 1.23e-2 1.27e-2 Lab_… 24.5 A4
## 4 A06 1.13e-2 4.33e-3 1.33e-2 1.17e-2 Lab_… 24.1 A6
## 5 A07 3.33e-3 3.33e-4 7.33e-3 3.67e-3 Lab_… 23.4 A7
## 6 A08 3.33e-3 -6.67e-4 8.33e-3 3.67e-3 Lab_… 23.7 A8
## 7 A09 1.33e-3 -1.67e-3 7.33e-3 1.67e-3 Lab_… 25.6 A9
## 8 A10 1.13e-2 5.33e-3 1.83e-2 1.07e-2 Lab_… 23 A10
## 9 A11 7.33e-3 1.33e-3 1.23e-2 7.67e-3 Lab_… 23.6 A11
## 10 A12 8.33e-3 1.33e-3 8.33e-3 8.67e-3 Lab_… 26.9 A12
## # ℹ 82 more rows
## # ℹ 5 more variables: date <dttm>, PE_mg_per_mL <dbl>, total_PE_mg <dbl>,
## # PE_mg_per_g_sample <dbl>, Xred <dbl>
##
## $merged_cells
## [1] 92
##
## $unmatched_cells
## [1] 4
##
## $unmatched_saved_to
## [1] "output PE/export data/pe_fluor_joins/pe_fluor2_unmatched.csv"
##
## $assigned_to_global
## [1] TRUE
joindf_by_id(
df1 = PE3_calc,
df2 = Fluor3_tidy,
output_name = file.path(pe_fluor_dir, "pe_fluor3_joined.csv"),
unmatched_out = file.path(pe_fluor_dir, "pe_fluor3_unmatched.csv"),
key_df1 = "join_id",
key_df2 = "Cell_ID"
)
## Join complete. Output saved to: output PE/export data/pe_fluor_joins/pe_fluor3_joined.csv
## $result_df
## # A tibble: 49 × 13
## ID `sample weight` `Tray well` join_id date X564 X565
## <chr> <dbl> <chr> <chr> <dttm> <dbl> <dbl>
## 1 Lab_40 26.8 G1 A01 2025-05-15 00:00:00 0.0193 0.0193
## 2 Lab_40 24.2 G2 A02 2025-05-15 00:00:00 0.0133 0.0133
## 3 Lab_40 26.7 G3 A03 2025-05-15 00:00:00 0.0223 0.0223
## 4 Lab_41 23.4 G4 A04 2025-05-15 00:00:00 0.0223 0.0223
## 5 Lab_41 25.6 G5 A05 2025-05-15 00:00:00 0.0283 0.0273
## 6 Lab_41 27 G6 A06 2025-05-15 00:00:00 0.0343 0.0333
## 7 Lab_42 25 G7 A07 2025-05-15 00:00:00 0.0183 0.0183
## 8 Lab_42 25.7 G8 A08 2025-05-15 00:00:00 0.0193 0.0193
## 9 Lab_42 24.6 G9 A09 2025-05-15 00:00:00 0.0223 0.0223
## 10 Lab_43 25.6 G10 A10 2025-05-15 00:00:00 0.0603 0.0593
## # ℹ 39 more rows
## # ℹ 6 more variables: X592 <dbl>, X455 <dbl>, PE_mg_per_mL <dbl>,
## # total_PE_mg <dbl>, PE_mg_per_g_sample <dbl>, Xred <dbl>
##
## $merged_cells
## [1] 49
##
## $unmatched_cells
## [1] 47
##
## $unmatched_saved_to
## [1] "output PE/export data/pe_fluor_joins/pe_fluor3_unmatched.csv"
##
## $assigned_to_global
## [1] TRUE
joindf_by_id(
df1 = PE4_calc,
df2 = Fluor4_tidy,
output_name = file.path(pe_fluor_dir, "pe_fluor4_joined.csv"),
unmatched_out = file.path(pe_fluor_dir, "pe_fluor4_unmatched.csv"),
key_df1 = "join_id",
key_df2 = "Cell_ID"
)
## Join complete. Output saved to: output PE/export data/pe_fluor_joins/pe_fluor4_joined.csv
## $result_df
## # A tibble: 74 × 15
## ID `sample weight` `tray well` join_id date X564 X592 X455
## <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Lab_01 23.6 1 A01 29/05/… -1.33e-3 -4.33e-3 4.67e-3
## 2 Lab_01 24 2 A02 29/05/… -3.33e-3 -5.33e-3 -1.33e-3
## 3 Lab_03 25.4 5 A05 29/05/… 6.67e-4 -4.33e-3 1.07e-2
## 4 Lab_03 25.2 6 A06 29/05/… 2.57e-2 7.67e-3 3.57e-2
## 5 Lab_04 26.2 7 A07 29/05/… 2.67e-3 -2.33e-3 1.27e-2
## 6 Lab_04 25.2 8 A08 29/05/… 1.07e-2 6.67e-4 1.17e-2
## 7 Lab_05 23.3 9 A09 29/05/… -1.33e-3 -4.33e-3 5.67e-3
## 8 Lab_05 23.3 10 A10 29/05/… -2.33e-3 -4.33e-3 6.67e-4
## 9 Lab_12 26.3 11 A11 29/05/… 2.67e-3 -1.33e-3 1.57e-2
## 10 Lab_12 26.5 12 A12 29/05/… 2.67e-3 -2.33e-3 1.37e-2
## # ℹ 64 more rows
## # ℹ 7 more variables: X618 <dbl>, X645 <dbl>, `X592[2]` <dbl>,
## # PE_mg_per_mL <dbl>, total_PE_mg <dbl>, PE_mg_per_g_sample <dbl>, Xred <dbl>
##
## $merged_cells
## [1] 74
##
## $unmatched_cells
## [1] 22
##
## $unmatched_saved_to
## [1] "output PE/export data/pe_fluor_joins/pe_fluor4_unmatched.csv"
##
## $assigned_to_global
## [1] TRUE
# Rename columns in one of the joined results (after it has been auto-assigned)
#colnames(pe_fluor4)[colnames(pe_fluor4_joined) == "sample ID"] <- "ID"
# Combine all joined dataframes into a list
pe_fluor_all <- list(pe_fluor1_joined, pe_fluor2_joined, pe_fluor3_joined, pe_fluor4_joined)
# Add a run identifier to each dataframe
pe_fluor_all <- Map(function(df, i) {
df$run <- factor(paste0("run", i))
df
}, pe_fluor_all, seq_along(pe_fluor_all))
# Find common columns across all runs
common_cols <- Reduce(intersect, lapply(pe_fluor_all, names))
#harmonize date columns
pe_fluor_all <- lapply(pe_fluor_all, function(df) {
df[] <- lapply(df, function(col) {
is_colname_date <- grepl("date", names(df)[which(sapply(df, identical, col))], ignore.case = TRUE)
if (inherits(col, "character") && is_colname_date) {
tryCatch(as.Date(col, format = "%m/%d/%Y"),
error = function(e) as.Date(col, format = "%d/%m/%Y"))
} else {
col
}
})
df
})
# Keep only the common columns in each dataframe before binding
combined_df <- bind_rows(
lapply(pe_fluor_all, function(df) df[common_cols]),
.id = "run"
)
#Finally merge all spreadsheets into one for replicate testing and analysis
#Scale PE bigger for regression purposes
combined_df <- combined_df %>%
mutate(PE_scaled = PE_mg_per_g_sample * 1000)
Each run’s PE + Fluorescence data was merged and saved (pe_fluor1, …, pe_fluor4).
combined_df now contains all four runs, only columns common to every run, a run factor, fixed date parsing, and a new column PE_scaled ready for regression analyses.
Xred
vs. PE_mg_per_g_sample) across every run in a single
scatterplot.run factor,
so we can see how the different runs overlap or differ.Xred_PE_all_runs.png and display it in the HTML.# Build a scatterplot of Xred vs. PE_mg_per_g_sample, colored by run
# Define directory for this plot group
pe_fluor_plot_dir <- file.path("output PE", "plots", "pe_fluor")
dir.create(pe_fluor_plot_dir, recursive = TRUE, showWarnings = FALSE)
# Create combined plot
p_all <- ggplot(combined_df, aes(x = PE_mg_per_g_sample, y = Xred, color = run)) +
geom_point(alpha = 0.6) +
theme_minimal() +
labs(
title = "Xred vs. PE_mg_per_g_sample (All Runs)",
x = "PE (mg/g sample)",
y = "Xred Fluorescence",
color = "Run"
) +
scale_color_brewer(palette = "Dark2") +
coord_cartesian(xlim = c(0, 0.5))
# Save the plot to the new subdirectory
ggsave(
filename = file.path(pe_fluor_plot_dir, "Xred_vs_PE_all_runs.png"),
plot = p_all,
width = 10,
height = 6,
dpi = 300,
bg = "white"
)
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Display the plot in the knitted HTML
print(p_all)
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
The scatterplot shows how Xred (fluorescence) increases with PE_mg_per_g_sample (phycoerythrin content), with each run distinguished by color.
Most points from every run lie between 0 – 0.5 mg/g on the x‐axis, confirming that the hard‐coded limit captures the bulk of the data.
Outliers beyond PE = 0.5 mg/g are clipped from view but still exist in the dataset.
Plain English:
combined_df that need to be removed."run 3 fresh" to the run column."run 3 fresh" to samples in rows 169–189
(adjust indices if your row count shifts).combined_df <- combined_df[!is.na(combined_df$PE_mg_per_g_sample), ]
# 1. Add a new factor level "run 3 fresh" to the 'run' factor
levels(combined_df$run) <- c(levels(combined_df$run), "fresh run 3 & 4")
# 2. Assign "fresh run 3 & 4 to fresh samples only.
combined_df$run[grepl("fresh", combined_df$ID, ignore.case = TRUE)] <- "fresh run 3 & 4"
# 3. Remove rows with NA values for PE mg. There was fluoresence data, but no phycoerythrin
combined_export_path <- file.path("output PE", "export data", "combined_df.csv")
write.csv(combined_df, combined_export_path, row.names = FALSE)
A new level “run 3 fresh” has been added
# ── Subset to run 2, drop missing PE/Xred, and filter PE ≥ 10 µg/g (i.e. ≥ 0.01 mg/g) ─────
combined_df_run2 <- combined_df %>%
filter(run == "2") %>% # keep only run 2
drop_na(PE_mg_per_g_sample, Xred) %>% # drop any NA in PE or Xred
filter(PE_mg_per_g_sample >= 0.01) # remove PE < 0.01 mg/g
## STEP 11: REGRESSION MODELS BY RUN
# ── Fit an initial OLS (PE_mg_per_g_sample ~ Xred) and flag statistical outliers ─────────
model_initial_run2 <- lm(PE_mg_per_g_sample ~ Xred, data = combined_df_run2)
combined_df_run2 <- combined_df_run2 %>%
mutate(
resid_initial = residuals(model_initial_run2),
std_resid = rstandard(model_initial_run2),
is_outlier = abs(std_resid) >= 2
)
# How many points and how many outliers?
combined_df_run2 %>%
summarize(
total_points = n(),
n_outliers = sum(is_outlier),
pct_outliers = mean(is_outlier) * 100
) %>%
print()
## # A tibble: 1 × 3
## total_points n_outliers pct_outliers
## <int> <int> <dbl>
## 1 81 4 4.94
# Visualize initial model with outliers highlighted
ggplot(combined_df_run2, aes(x = Xred, y = PE_mg_per_g_sample, color = is_outlier)) +
geom_point(size = 2, alpha = 0.7) +
scale_color_manual(values = c("FALSE" = "steelblue", "TRUE" = "firebrick")) +
geom_abline(
slope = coef(model_initial_run2)["Xred"],
intercept = coef(model_initial_run2)["(Intercept)"],
color = "darkgreen", size = 1
) +
labs(
title = "Run 2: Initial PE vs. Xred (Red = |Std Resid| ≥ 2)",
x = "Xred (raw fluorescence)",
y = "PE (mg per g sample)"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# ── Remove flagged outliers and refit the “clean” calibration model ───────────────────────
combined_df_run2_clean <- combined_df_run2 %>%
filter(!is_outlier)
model_clean_run2 <- lm(PE_mg_per_g_sample ~ Xred, data = combined_df_run2_clean)
# Summarize the clean model
tidy(model_clean_run2) %>% print()
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.00268 0.00158 1.69 9.52e- 2
## 2 Xred 0.00000527 0.000000154 34.3 1.49e-47
summary(model_clean_run2)$r.squared %>% print()
## [1] 0.9399393
# Visualize the final fit on run 2 (outliers removed)
ggplot(combined_df_run2_clean, aes(x = Xred, y = PE_mg_per_g_sample)) +
geom_point(color = "steelblue", size = 2, alpha = 0.7) +
geom_smooth(method = "lm", color = "darkgreen", se = FALSE) +
labs(
title = "Run 2: Final Calibration (Outliers Removed)",
x = "Xred (raw fluorescence)",
y = "PE (mg per g sample)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# ── Define a function to add fluorescence‐predicted PE to any dataframe ──────────────────
# Extract coefficients from the clean run‐2 model
coef_intercept <- coef(model_clean_run2)["(Intercept)"]
coef_slope <- coef(model_clean_run2)["Xred"]
#' calc_PE_from_Xred
#'
#' Given a dataframe containing a column of fluorescence values (Xred),
#' adds a new column with predicted PE (mg/g) based on the cleaned Run 2 calibration.
#'
#' @param df A data frame.
#' @param fluor_col String: name of the column in df containing the fluorescence (Xred) values.
#' @param new_col String: desired name for the new predicted PE column (default = "PE_predicted_mg_per_g").
#' @return A new data frame with an additional column `new_col`.
# apply to the master dataset
combined_df <- calc_PE_from_Xred(combined_df, fluor_col = "Xred", new_col = "PE_pred_run2_mg_per_g")
# View the first few rows to confirm
combined_df %>%
select(join_id, run, Xred, PE_mg_per_g_sample, PE_pred_run2_mg_per_g) %>%
head(10) %>%
print()
## # A tibble: 10 × 5
## join_id run Xred PE_mg_per_g_sample PE_pred_run2_mg_per_g
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 A02 1 16642 0.0519 0.0904
## 2 A03 1 24823 0.0734 0.134
## 3 A04 1 15650 0.0324 0.0852
## 4 A06 1 2119 0.0106 0.0138
## 5 A08 1 10796 0.0306 0.0596
## 6 A09 1 21089 0.0206 0.114
## 7 A10 1 32338 0.103 0.173
## 8 A12 1 7922 0.00340 0.0444
## 9 B02 1 6026 0.0165 0.0344
## 10 B03 1 3957 0.00947 0.0235
# ── (Optional) Function to also add standard‐error-of‐fit using the clean Run 2 model ────────
#' add_PE_with_se
#'
#' Given a dataframe containing a column of fluorescence values (Xred),
#' adds two new columns:
#' 1) Predicted PE (mg/g) based on the cleaned Run 2 model
#' 2) Standard error of the fitted mean (SE) for each prediction
#'
#' @param df A data frame.
#' @param fluor_col String: name of the column in df containing the fluorescence (Xred) values.
#' @param pred_col String: name of the new predicted PE column.
#' @param se_col String: name of the new standard‐error column.
#' @return A new data frame with additional columns `pred_col` and `se_col`.
# Example: apply to the master dataset
combined_df <- add_PE_with_se(combined_df,
fluor_col = "Xred",
pred_col = "PE_pred_run2_mg_per_g",
se_col = "PE_se_run2")
# View the first few rows to confirm
combined_df %>%
select(join_id, run, Xred, PE_mg_per_g_sample,
PE_pred_run2_mg_per_g, PE_se_run2) %>%
head(10) %>%
print()
## # A tibble: 10 × 6
## join_id run Xred PE_mg_per_g_sample PE_pred_run2_mg_per_g PE_se_run2
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 A02 1 16642 0.0519 0.0904 0.00168
## 2 A03 1 24823 0.0734 0.134 0.00279
## 3 A04 1 15650 0.0324 0.0852 0.00156
## 4 A06 1 2119 0.0106 0.0138 0.00135
## 5 A08 1 10796 0.0306 0.0596 0.00110
## 6 A09 1 21089 0.0206 0.114 0.00226
## 7 A10 1 32338 0.103 0.173 0.00389
## 8 A12 1 7922 0.00340 0.0444 0.00101
## 9 B02 1 6026 0.0165 0.0344 0.00105
## 10 B03 1 3957 0.00947 0.0235 0.00118
# Compute differences between measured and predicted PE
de <- "output PE/plots"
if (!dir.exists(de)) dir.create(de, recursive = TRUE)
df_compare <- combined_df %>%
filter(!is.na(PE_mg_per_g_sample), !is.na(PE_pred_run2_mg_per_g)) %>%
mutate(
diff = PE_mg_per_g_sample - PE_pred_run2_mg_per_g,
diff_abs = abs(diff),
mean_PE = (PE_mg_per_g_sample + PE_pred_run2_mg_per_g) / 2
)
# 1) Scatter: Measured vs Predicted PE
p1 <- ggplot(df_compare, aes(x = PE_pred_run2_mg_per_g, y = PE_mg_per_g_sample, color = run)) +
geom_point(size = 2) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
geom_text_repel(aes(label = join_id), size = 3, max.overlaps = 20) +
labs(
title = "Measured vs. Predicted PE by Run",
x = "Predicted PE (mg/g)",
y = "Measured PE (mg/g)",
color = "Run"
) +
theme_minimal()
print(p1)
# Save plot
ggsave(filename = file.path("output PE/plots", "measured_vs_predicted_PE.png"), plot = p1, width = 6, height = 4)
# 2) Residuals vs Predicted
p2 <- ggplot(df_compare, aes(x = PE_pred_run2_mg_per_g, y = diff)) +
geom_point( size = 2) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(
title = "Residuals vs. Predicted",
x = "Predicted PE (mg/g)",
y = "Measured – Predicted (mg/g)",
color = "Run"
) +
theme_minimal()
print(p2)
# Save plot
ggsave(filename = file.path(de, "residuals_vs_predicted_PE.png"), plot = p2, width = 6, height = 4)
# 3) Bland–Altman plot
p3 <- ggplot(df_compare, aes(x = mean_PE, y = diff)) +
geom_point(color = "darkgreen", size = 2) +
geom_hline(yintercept = mean(df_compare$diff), color = "blue") +
geom_hline(yintercept = mean(df_compare$diff) + 1.96 * sd(df_compare$diff),
linetype = "dashed", color = "grey50") +
geom_hline(yintercept = mean(df_compare$diff) - 1.96 * sd(df_compare$diff),
linetype = "dashed", color = "grey50") +
labs(
title = "Bland–Altman Plot",
x = "Mean of Measured & Predicted PE (mg/g)",
y = "Difference (Measured – Predicted)"
) +
theme_minimal()
print(p3)
# Save plot
ggsave(filename = file.path(de, "bland_altman_PE.png"), plot = p3, width = 6, height = 4)
# Extract and save top 20 samples with largest absolute differences
top20_diff <- df_compare %>%
arrange(desc(diff_abs)) %>%
slice(1:20) %>%
select(join_id, run, PE_mg_per_g_sample, PE_pred_run2_mg_per_g, diff, diff_abs)
# Print top 20 to console
print(top20_diff)
## # A tibble: 20 × 6
## join_id run PE_mg_per_g_sample PE_pred_run2_mg_per_g diff diff_abs
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 G07 4 Inf 0.00268 Inf Inf
## 2 G06 4 0.539 0.203 0.336 0.336
## 3 B09 1 0.00686 0.224 -0.217 0.217
## 4 B07 1 0.238 0.0417 0.196 0.196
## 5 G04 4 0.338 0.159 0.180 0.180
## 6 E01 3 0.498 0.333 0.165 0.165
## 7 D02 1 0.11 0.255 -0.145 0.145
## 8 C12 3 0.590 0.450 0.140 0.140
## 9 C03 1 0.129 0.239 -0.109 0.109
## 10 G03 4 0.224 0.119 0.106 0.106
## 11 H06 2 0.344 0.241 0.103 0.103
## 12 D03 1 0.101 0.202 -0.101 0.101
## 13 F01 1 0.127 0.225 -0.0973 0.0973
## 14 G12 1 0.0675 0.164 -0.0961 0.0961
## 15 F03 1 0.127 0.223 -0.0956 0.0956
## 16 F06 1 0.0928 0.187 -0.0940 0.0940
## 17 A09 1 0.0206 0.114 -0.0933 0.0933
## 18 F05 1 0.107 0.198 -0.0912 0.0912
## 19 F04 1 0.130 0.217 -0.0872 0.0872
## 20 H02 1 0.0648 0.147 -0.0822 0.0822
# Create export directory for data if it doesn't exist
export_dir <- "output PE/export data"
if (!dir.exists(export_dir)) dir.create(export_dir, recursive = TRUE)
# Save to CSV in the export directory
write_csv(top20_diff, file.path(export_dir, "top20_PE_differences.csv"))
For each unique run in combined_df, we will:
1. Filter to that run’s subset of data.
2. If the run has fewer than 4 samples, skip with a
warning.
3. Fit three regression models:
- Linear:
\[
X_{\text{red}} \sim PE\_scaled
\]
- Log:
\[
X_{\text{red}} \sim \log\bigl(PE\_scaled + 0.001\bigr)
\]
- Polynomial:
\[
X_{\text{red}} \sim PE\_scaled \;+\; I\bigl(PE\_scaled^2\bigr)
\]
4. Extract R² and p‐values for each model using
get_stats().
5. Build a ggplot that:
- Plots raw points (geom_point).
- Adds fitted lines:
- Blue = linear model
- Green (dashed) = log model
- Red (dot‐dash) = polynomial model
- Annotates each line’s R²/p‐value in the top‐right corner.
6. Save each run’s plot under
plots/regression/<run>/Xred_PE_regressions_<run>.png.
7. Print each plot so it appears in the knitted
HTML.
# 1. Ensure 'run' is treated as a factor
combined_df$run <- as.factor(combined_df$run)
# 2. Create base directory for all regression plots
reg_dir <- file.path("output PE", "plots", "pe_fluor_regressions")
dir.create(reg_dir, recursive = TRUE, showWarnings = FALSE)
# 3. Loop over each unique run identifier
for (r in unique(as.character(combined_df$run))) {
# 3a. Subset to only this run
# 3a. Subset to only this run
df_sub <- combined_df %>% filter(run == r)
# 3a.1 Clean the data
df_sub <- df_sub %>%
filter(!is.na(PE_scaled), !is.na(Xred),
is.finite(PE_scaled), is.finite(Xred),
PE_scaled > -0.001)
# 3b. Skip if too few data points
if (nrow(df_sub) < 4) {
warning(paste("Skipping run", r, "- not enough data (n =", nrow(df_sub), ")"))
next
}
# 3c. Fit models
model_linear <- lm(Xred ~ PE_scaled, data = df_sub)
model_log <- lm(Xred ~ log(PE_scaled + 0.001), data = df_sub)
model_poly <- lm(Xred ~ PE_scaled + I(PE_scaled^2), data = df_sub)
# 3d. Get stats
ann_linear <- get_stats(model_linear, "Linear")
ann_log <- get_stats(model_log, "Log")
ann_poly <- get_stats(model_poly, "Poly")
# 3e. Create plot
p <- ggplot(df_sub, aes(x = PE_scaled, y = Xred)) +
geom_point(alpha = 0.6, color = "black") +
stat_smooth(method = "lm", formula = y ~ x,
se = FALSE, color = "blue") +
stat_smooth(method = "lm", formula = y ~ log(x + 0.001),
se = FALSE, color = "green", linetype = "dashed") +
stat_smooth(method = "lm", formula = y ~ x + I(x^2),
se = FALSE, color = "red", linetype = "dotdash") +
annotate("text", x = Inf, y = Inf, label = ann_linear,
hjust = 1.05, vjust = 2, color = "blue", size = 4) +
annotate("text", x = Inf, y = Inf, label = ann_log,
hjust = 1.05, vjust = 3.5, color = "green", size = 4) +
annotate("text", x = Inf, y = Inf, label = ann_poly,
hjust = 1.05, vjust = 5, color = "red", size = 4) +
theme_minimal() +
labs(
title = paste("Regression Models: Xred vs PE (", r, ")"),
x = "PE_scaled (mg/g sample × 1000)",
y = "Xred Fluorescence"
)
# 3f. Save the plot (all to one shared folder)
filename <- paste0("Xred_PE_regressions_", r, ".png")
ggsave(
filename = file.path(reg_dir, filename),
plot = p,
width = 10,
height = 6,
dpi = 300,
bg = "white"
)
# 3g. Show plot in knitted HTML
print(p)
}
analyze_replicates() function will:
id_col) and perform
calculations on numeric columns (e.g., PE_mg_per_g_sample,
Xred).choose_best_3 = TRUE) by minimizing the coefficient of
variation (CV) before calculating summary stats.<output_prefix>_summary.csv)
with one row per sample, containing all summary metrics.<output_prefix>_summary).Below we run two passes of analyze_replicates() on
combined_df: 1. Without enhanced CV‐based selection
(choose_best_3 = FALSE), saving as
all_rep_analy_summary.csv.
2. With enhanced selection (choose_best_3 = TRUE), saving
as E_rep_analy_summary.csv.
Finally, we use graph_histograms_with_error() to plot
histograms with error bars for key variables
(PE_mg_per_g_sample and Xred).
# 1. Analyze replicates without enhanced (best-3) selection
analyze_replicates(
data = combined_df,
id_col = "ID", # column that uniquely identifies each sample
join_col = "join_id", # also used for joining, same as id_col here
weight_col = "sample weight", # column containing sample weight in grams
date_col = "date", # column containing sample collection date
output_prefix = "all_rep_analy", # prefix for output files (will produce all_rep_analy_summary.csv)
choose_best_3 = FALSE, # do not filter replicates, use all
dir = "output PE/export data/"
)
## Summary saved to: output PE/export data/all_rep_analy_summary.csv
## # A tibble: 59 × 65
## ID PE_mg_per_g_sample_mean PE_mg_per_mL_mean PE_pred_run2_mg_per_g_mean
## <fct> <dbl> <dbl> <dbl>
## 1 BLANK_01 Inf 0.0000960 0.00268
## 2 Lab_01 0.0351 0.000360 0.0558
## 3 Lab_02 0.0120 0.000168 0.0100
## 4 Lab_03 0.0438 0.000540 0.0687
## 5 Lab_04 0.0254 0.000412 0.0295
## 6 Lab_05 0.0199 0.000279 0.0251
## 7 Lab_06 0.0972 0.00101 0.0475
## 8 Lab_07 0.0548 0.000596 0.101
## 9 Lab_08 0.0321 0.000440 0.0462
## 10 Lab_09 0.0693 0.0006 0.101
## # ℹ 49 more rows
## # ℹ 61 more variables: PE_scaled_mean <dbl>, PE_se_run2_mean <dbl>,
## # X455_mean <dbl>, X564_mean <dbl>, X592_mean <dbl>, Xred_mean <dbl>,
## # total_PE_mg_mean <dbl>, PE_mg_per_g_sample_sd <dbl>, PE_mg_per_mL_sd <dbl>,
## # PE_pred_run2_mg_per_g_sd <dbl>, PE_scaled_sd <dbl>, PE_se_run2_sd <dbl>,
## # X455_sd <dbl>, X564_sd <dbl>, X592_sd <dbl>, Xred_sd <dbl>,
## # total_PE_mg_sd <dbl>, PE_mg_per_g_sample_se <dbl>, PE_mg_per_mL_se <dbl>, …
# 2. Analyze replicates with enhanced (best-3) selection by CV
analyze_replicates(
data = combined_df,
id_col = "ID", # column uniquely identifying each sample
join_col = "join_id", # join key for replicate grouping
weight_col = "sample weight", # sample weight column (grams)
date_col = "date", # collection date column
output_prefix = "E_rep_analy", # prefix for output files (will produce E_rep_analy_summary.csv)
choose_best_3 = TRUE,
dir = "output PE/export data/"
# select the best 3 replicates (lowest CV) per sample
)
## Summary saved to: output PE/export data/E_rep_analy_summary.csv
## # A tibble: 59 × 85
## ID PE_mg_per_g_sample_mean PE_mg_per_mL_mean PE_pred_run2_mg_per_g_mean
## <fct> <dbl> <dbl> <dbl>
## 1 BLANK_01 Inf 0.0000960 0.00268
## 2 Lab_01 0.0340 0.000544 0.103
## 3 Lab_02 0.0104 0.000200 0.0111
## 4 Lab_03 0.0287 0.000608 0.0255
## 5 Lab_04 0.0114 0.000192 0.0449
## 6 Lab_05 0.0163 0.000120 0.0294
## 7 Lab_06 0.0755 0.00142 0.0441
## 8 Lab_07 0.0843 0.000728 0.166
## 9 Lab_08 0.0441 0.000744 0.0448
## 10 Lab_09 0.112 0.000760 0.0271
## # ℹ 49 more rows
## # ℹ 81 more variables: PE_scaled_mean <dbl>, PE_se_run2_mean <dbl>,
## # X455_mean <dbl>, X564_mean <dbl>, X592_mean <dbl>, Xred_mean <dbl>,
## # total_PE_mg_mean <dbl>, PE_mg_per_g_sample_sd <dbl>, PE_mg_per_mL_sd <dbl>,
## # PE_pred_run2_mg_per_g_sd <dbl>, PE_scaled_sd <dbl>, PE_se_run2_sd <dbl>,
## # X455_sd <dbl>, X564_sd <dbl>, X592_sd <dbl>, Xred_sd <dbl>,
## # total_PE_mg_sd <dbl>, PE_mg_per_g_sample_se <dbl>, PE_mg_per_mL_se <dbl>, …
# 3. Generate histograms with error bars for key variables
## SAVE HISTOGRAMS WITH ERROR BARS AS PNGS
# Create output folder if it doesn't exist
"PE_mg_per_g_sample", "Xred")."<variable>_se" (e.g.,
"PE_mg_per_g_sample_se", "Xred_se").graph_replicates_custom_error() once for
each pair (value_col, se_col)."<variable>_replicate_analysis_plots" (based on
output_prefix) and save a PNG of the bar plot with error
bars.# 1. Define the variables to plot
variables <- c("PE_mg_per_g_sample", "Xred", "X564")
# 2. Define shared output directory
rep_plot_dir <- file.path("output PE", "plots", "replicate_analysis")
dir.create(rep_plot_dir, recursive = TRUE, showWarnings = FALSE)
# 3. Read summary data
E_rep_analy_summary <- read.csv("output PE/export data/E_rep_analy_summary.csv")
# 4. Initialize list for interactive plotly plots
interactive_plots <- list()
# 5. Loop through each variable and generate plots
for (var in variables) {
se_col_name <- paste0(var, "_se")
mean_col_name <- paste0(var, "_mean")
output_prefix <- file.path(rep_plot_dir, paste0(var, "_replicate_analysis"))
interactive_plot <- graph_replicates_custom_error(
data = E_rep_analy_summary,
id_col = "ID",
value_col = mean_col_name,
se_col = se_col_name,
output_prefix = output_prefix # e.g. PE/output_PE/plots/replicate_analysis/Xred_replicate_analysis
)
interactive_plots[[var]] <- interactive_plot
}
## Saving 7 x 5 in image
## Plot saved to: output PE/plots/replicate_analysis/PE_mg_per_g_sample_replicate_analysis_plots/PE_mg_per_g_sample_mean_replicates.png
##
## Saving 7 x 5 in image
## Plot saved to: output PE/plots/replicate_analysis/Xred_replicate_analysis_plots/Xred_mean_replicates.png
##
## Saving 7 x 5 in image
## Plot saved to: output PE/plots/replicate_analysis/X564_replicate_analysis_plots/X564_mean_replicates.png
# 6. Display plots in R Markdown HTML output
htmltools::tagList(interactive_plots)
Combined_df_before <- combined_df[combined_df$run != "4",]
analyze_replicates(
data = Combined_df_before,
id_col = "ID", # column uniquely identifying each sample
join_col = "join_id", # join key for replicate grouping
weight_col = "sample weight", # sample weight column (grams)
date_col = "date", # collection date column
output_prefix = "E_rep_analy_before",
choose_best_3 = TRUE,
dir = "output PE/export data/"
)
## Warning in max(abs(df_best$value - mean_val)/mean_val * 100, na.rm = TRUE): no
## non-missing arguments to max; returning -Inf
## Warning in max(abs(df_best$value - mean_val)/mean_val * 100, na.rm = TRUE): no
## non-missing arguments to max; returning -Inf
## Warning in max(abs(df_best$value - mean_val)/mean_val * 100, na.rm = TRUE): no
## non-missing arguments to max; returning -Inf
## Warning in max(abs(df_best$value - mean_val)/mean_val * 100, na.rm = TRUE): no
## non-missing arguments to max; returning -Inf
## Warning in max(abs(df_best$value - mean_val)/mean_val * 100, na.rm = TRUE): no
## non-missing arguments to max; returning -Inf
## Warning in max(abs(df_best$value - mean_val)/mean_val * 100, na.rm = TRUE): no
## non-missing arguments to max; returning -Inf
## Summary saved to: output PE/export data/E_rep_analy_before_summary.csv
## # A tibble: 48 × 85
## ID PE_mg_per_g_sample_mean PE_mg_per_mL_mean PE_pred_run2_mg_per_g_mean
## <fct> <dbl> <dbl> <dbl>
## 1 Lab_01 0.0340 0.000544 0.103
## 2 Lab_02 0.0104 0.000200 0.0111
## 3 Lab_03 0.0287 0.000608 0.0271
## 4 Lab_04 0.00791 0.000120 0.0356
## 5 Lab_05 0.0163 0.0000960 0.0294
## 6 Lab_06 0.0755 0.00142 0.0441
## 7 Lab_07 0.0843 0.000728 0.166
## 8 Lab_08 0.0441 0.000744 0.0448
## 9 Lab_09 0.112 0.000760 0.0271
## 10 Lab_10 0.0448 0.000728 0.0570
## # ℹ 38 more rows
## # ℹ 81 more variables: PE_scaled_mean <dbl>, PE_se_run2_mean <dbl>,
## # X455_mean <dbl>, X564_mean <dbl>, X592_mean <dbl>, Xred_mean <dbl>,
## # total_PE_mg_mean <dbl>, PE_mg_per_g_sample_sd <dbl>, PE_mg_per_mL_sd <dbl>,
## # PE_pred_run2_mg_per_g_sd <dbl>, PE_scaled_sd <dbl>, PE_se_run2_sd <dbl>,
## # X455_sd <dbl>, X564_sd <dbl>, X592_sd <dbl>, Xred_sd <dbl>,
## # total_PE_mg_sd <dbl>, PE_mg_per_g_sample_se <dbl>, PE_mg_per_mL_se <dbl>, …
Before <- read.csv("output PE/export data/E_rep_analy_before_summary.csv")
E_rep_analy_summary <- E_rep_analy_summary[-1,]
se_change <- mean(E_rep_analy_summary$PE_mg_per_g_sample_se, na.rm = TRUE)- mean(Before$PE_mg_per_g_sample_se, na.rm=T)
paste("sample se for replicates change by:", se_change)
## [1] "sample se for replicates change by: -0.00382526703367693"
Check effects of enhancement algorithm
PErep_enhanced <- read_csv("output PE/export data/E_rep_analy_summary.csv") #retrieve analized replicates from enhanced CV
## Rows: 59 Columns: 85
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): ID, join_ids, replicate_date
## dbl (62): PE_mg_per_g_sample_mean, PE_mg_per_mL_mean, PE_pred_run2_mg_per_g_...
## num (20): PE_mg_per_g_sample_included_rows, PE_mg_per_mL_included_rows, PE_p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
PErep_all <- read_csv("output PE/export data/all_rep_analy_summary.csv") #retrieve analized replicates without enhancement
## Rows: 59 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): ID, join_ids, replicate_date
## dbl (62): PE_mg_per_g_sample_mean, PE_mg_per_mL_mean, PE_pred_run2_mg_per_g_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Note, this is a product of analyze_replicates using enhanced replicate selection.
SE_change <- PErep_enhanced$PE_mg_per_g_sample_max_dev_pct -PErep_all$PE_mg_per_g_sample_max_dev_pct
print(SE_change)
## [1] NaN -95.108168 -19.902274 -128.706311 -108.064228 -185.850844
## [7] -108.785120 -58.363908 -36.616304 -69.382077 -75.855386 -53.173964
## [13] -243.116436 -74.728616 -97.153013 -27.285558 -248.700651 -140.641449
## [19] -140.486124 -53.367967 -69.692891 -41.812381 -56.191109 -25.362519
## [25] -61.400035 -43.591519 -51.338031 -54.883972 -115.956778 -44.245268
## [31] -124.140828 -72.129501 -70.399559 -82.781859 -88.693591 -90.270432
## [37] -57.064369 -147.851170 -104.640528 -15.885761 -13.205278 -21.560248
## [43] 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [49] 0.000000 0.000000 -60.386760 0.000000 0.000000 -7.568183
## [55] -170.931322 -57.869010 -46.732619 0.000000 0.000000
SE_change <- SE_change[-51] #remove one NaN at the end
paste("# number of replicates SE acted upon:", length(SE_change[SE_change != 0]))
## [1] "# number of replicates SE acted upon: 46"
paste("average improvement by max deviation as a percent of the mean:", abs(mean(SE_change[SE_change != 0], na.rm = TRUE)))
## [1] "average improvement by max deviation as a percent of the mean: 82.2552702172294"
Plain English:
PErep_enhanced) with our sample metadata
(Sample data) so that we can run group‐level comparisons
(e.g., by location, variety, life stage).PE_mg_per_g_sample_mean using
graph_replicates_custom_error().compare_groups() four times to produce boxplots
(with significance letters) for:
# 1. Merge enhanced replicate summary with metadata by column "ID"
# This will save "PErep_final.csv" and assign the merged data frame to `PErep_final`.
# 1. Define export folder and create it if needed
final_export_dir <- file.path("output PE", "export data", "Samples Analysis Final")
dir.create(final_export_dir, recursive = TRUE, showWarnings = FALSE)
# 2. Join PErep_enhanced with Sample data
joindf_by_id(
df1 = PErep_enhanced,
df2 = `Sample data`,
output_name = file.path(final_export_dir, "PErep_final.csv"),
unmatched_out = file.path(final_export_dir, "PErep_unmatched.csv"),
key_df1 = "ID",
key_df2 = "ID"
)
## Join complete. Output saved to: output PE/export data/Samples Analysis Final/PErep_final.csv
## $result_df
## # A tibble: 59 × 92
## join_id PE_mg_per_g_sample_mean PE_mg_per_mL_mean PE_pred_run2_mg_per_g_mean
## <chr> <dbl> <dbl> <dbl>
## 1 BLANK_01 Inf 0.0000960 0.00268
## 2 Lab_01 0.0340 0.000544 0.103
## 3 Lab_02 0.0104 0.0002 0.0111
## 4 Lab_03 0.0287 0.000608 0.0255
## 5 Lab_04 0.0114 0.000192 0.0449
## 6 Lab_05 0.0163 0.00012 0.0294
## 7 Lab_06 0.0755 0.00142 0.0441
## 8 Lab_07 0.0843 0.000728 0.166
## 9 Lab_08 0.0441 0.000744 0.0448
## 10 Lab_09 0.112 0.00076 0.0271
## # ℹ 49 more rows
## # ℹ 88 more variables: PE_scaled_mean <dbl>, PE_se_run2_mean <dbl>,
## # X455_mean <dbl>, X564_mean <dbl>, X592_mean <dbl>, Xred_mean <dbl>,
## # total_PE_mg_mean <dbl>, PE_mg_per_g_sample_sd <dbl>, PE_mg_per_mL_sd <dbl>,
## # PE_pred_run2_mg_per_g_sd <dbl>, PE_scaled_sd <dbl>, PE_se_run2_sd <dbl>,
## # X455_sd <dbl>, X564_sd <dbl>, X592_sd <dbl>, Xred_sd <dbl>,
## # total_PE_mg_sd <dbl>, PE_mg_per_g_sample_se <dbl>, PE_mg_per_mL_se <dbl>, …
##
## $merged_cells
## [1] 57
##
## $unmatched_cells
## [1] 11
##
## $unmatched_saved_to
## [1] "output PE/export data/Samples Analysis Final/PErep_unmatched.csv"
##
## $assigned_to_global
## [1] TRUE
# 3. Read in the joined summary
PErep_final <- readr::read_csv(file.path(final_export_dir, "PErep_final.csv"))
## Rows: 59 Columns: 92
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): join_id, join_ids, replicate_date, Sample Code, Location, region, ...
## dbl (82): PE_mg_per_g_sample_mean, PE_mg_per_mL_mean, PE_pred_run2_mg_per_g_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 4. Define output directory for replicate analysis plots
rep_plot_dir <- file.path("output PE", "plots", "replicate_analysis")
dir.create(rep_plot_dir, recursive = TRUE, showWarnings = FALSE)
# 5. Generate histogram with error bars for PE_mg_per_g_sample_mean
graph_replicates_custom_error(
data = PErep_final,
id_col = "join_id",
value_col = "PE_mg_per_g_sample_mean",
se_col = "PE_mg_per_g_sample_se",
output_prefix = file.path(rep_plot_dir, "E_rep_analy")
)
## Saving 7 x 5 in image
## Plot saved to: output PE/plots/replicate_analysis/E_rep_analy_plots/PE_mg_per_g_sample_mean_replicates.png
PE_location <- PErep_final %>%
filter(!Location %in% c("Lima Market Freeze Dry", "Ilo Freeze Dry", "Ilo oven dry", "Ilo Fresh", "Lima Market Fresh"))
PE_location_cham <- PErep_final %>%
filter(!variety %in% c("F.Glom"))
# 6. Run group comparisons and print outputs
###################################################Location
compare_groups(
data = PE_location_cham,
response_var = "PE_mg_per_g_sample_mean",
group_var = "Location",
subfolder_name = "PE_Location_cham_A"
)
##
## Attaching package: 'ggpubr'
##
## The following objects are masked from 'package:datawizard':
##
## mean_sd, median_mad
##
## For one-way between subjects designs, partial eta squared is equivalent
## to eta squared. Returning eta squared.
compare_groups(
data = PE_location_cham,
response_var = "PE_pred_run2_mg_per_g_mean",
group_var = "Location",
subfolder_name = "PE_Location_cham_B"
)
## For one-way between subjects designs, partial eta squared is equivalent
## to eta squared. Returning eta squared.
################################################Life_S
compare_groups(
data = PE_location_cham,
response_var = "PE_mg_per_g_sample_mean",
group_var = "Life_S",
subfolder_name = "PE_Life_S_cham_A"
)
## For one-way between subjects designs, partial eta squared is equivalent
## to eta squared. Returning eta squared.
compare_groups(
data = PE_location_cham,
response_var = "PE_pred_run2_mg_per_g_mean",
group_var = "Life_S",
subfolder_name = "PE_Life_S_cham_B"
)
## For one-way between subjects designs, partial eta squared is equivalent
## to eta squared. Returning eta squared.
###########################################Variety
PE_paracas_marcona <- PErep_final %>%
filter(Location %in% c("Mendieta", "7H", "Caro Caido"))
compare_groups(
data = PE_paracas_marcona,
response_var = "PE_mg_per_g_sample_mean",
group_var = "variety",
subfolder_name = "PE_variety_A"
)
compare_groups(
data = PE_location_cham,
response_var = "PE_pred_run2_mg_per_g_mean",
group_var = "variety",
subfolder_name = "PE_variety_B"
)
#################################Preperation method
PE_methods <- PErep_final %>%
filter(Location %in% c("Lima Market Freeze Dry", "Ilo Freeze Dry", "Ilo oven dry", "Ilo Fresh", "Lima Market Fresh", "Lima Market Oven Dry"))
compare_groups(
data = PE_methods,
response_var = "PE_mg_per_g_sample_mean",
group_var = "Location",
subfolder_name = "PE_method_A"
)
## For one-way between subjects designs, partial eta squared is equivalent
## to eta squared. Returning eta squared.
compare_groups(
data = PE_methods,
response_var = "PE_pred_run2_mg_per_g_mean",
group_var = "Location",
subfolder_name = "PE_method_B"
)
## For one-way between subjects designs, partial eta squared is equivalent
## to eta squared. Returning eta squared.
#################################Gam Cham
PE_gamtetra <- PE_location %>%
filter(Life_S %in% c("Gam/Tetra", "Gam", "Tetra"))
compare_groups(
data = PE_gamtetra,
response_var = "PE_mg_per_g_sample_mean",
group_var = "Location",
subfolder_name = "PE_gamtetra_location_A"
)
## For one-way between subjects designs, partial eta squared is equivalent
## to eta squared. Returning eta squared.
compare_groups(
data = PE_gamtetra,
response_var = "PE_pred_run2_mg_per_g_mean",
group_var = "Location",
subfolder_name = "PE_gamtetra_location_A"
)
## For one-way between subjects designs, partial eta squared is equivalent
## to eta squared. Returning eta squared.